R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(Metrics)
library(readr)
library(ggplot2)#for visualisation
library(corrplot)#for visualisation of correlation
## corrplot 0.92 loaded
library(mlbench) 
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)#converting ggplot to plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
library(lattice)
library(caret)
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(caTools)#for splittind data into testing and training data
library(dplyr) #manipulating dataframe
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mlbench)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read_csv("C:/Users/FD_gi/Documents/Regression lineal/data/kc-house-data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): id
## dbl  (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(data)
## spc_tbl_ [21,613 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id           : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
##  $ date         : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ zipcode      : num [1:21613] 98178 98125 98028 98136 98074 ...
##  $ lat          : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num [1:21613] -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num [1:21613] 5650 7639 8062 5000 7503 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   date = col_datetime(format = ""),
##   ..   price = col_double(),
##   ..   bedrooms = col_double(),
##   ..   bathrooms = col_double(),
##   ..   sqft_living = col_double(),
##   ..   sqft_lot = col_double(),
##   ..   floors = col_double(),
##   ..   waterfront = col_double(),
##   ..   view = col_double(),
##   ..   condition = col_double(),
##   ..   grade = col_double(),
##   ..   sqft_above = col_double(),
##   ..   sqft_basement = col_double(),
##   ..   yr_built = col_double(),
##   ..   yr_renovated = col_double(),
##   ..   zipcode = col_double(),
##   ..   lat = col_double(),
##   ..   long = col_double(),
##   ..   sqft_living15 = col_double(),
##   ..   sqft_lot15 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
dim(data)
## [1] 21613    21
missmap(data,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.

#Checking for NA and missing values and removing them.
numberOfNA <- length(which(is.na(data)==T))
numberOfNA
## [1] 0
# Remove NA values
data <- data %>%
        drop_na()


str(data) 
## tibble [21,613 × 21] (S3: tbl_df/tbl/data.frame)
##  $ id           : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
##  $ date         : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ zipcode      : num [1:21613] 98178 98125 98028 98136 98074 ...
##  $ lat          : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num [1:21613] -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num [1:21613] 5650 7639 8062 5000 7503 ...
dim(data)
## [1] 21613    21
data <- data %>% dplyr::select(-c(date))

#Drop  id
data$id <- NULL

library(corrplot)


str(data)
## tibble [21,613 × 19] (S3: tbl_df/tbl/data.frame)
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ zipcode      : num [1:21613] 98178 98125 98028 98136 98074 ...
##  $ lat          : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num [1:21613] -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num [1:21613] 5650 7639 8062 5000 7503 ...
corrplot(cor(data))

corrplot(cor(data),method='number')

# Highly correlated variables
correlated <- cor(data)
##The caret findCorrelation evaluates pair-wise correlations across all variables, flagging variables that are highly correlated. Of the identified pairs, the function recommends the removal of the variable with the highest average absolute correlation across the dataset.
highCorr <- findCorrelation(correlated, cutoff=0.80)
highCorr
## [1] 4
names(data[highCorr])
## [1] "sqft_living"
summary(data)
##      price            bedrooms        bathrooms      sqft_living   
##  Min.   :  75000   Min.   : 0.000   Min.   :0.000   Min.   :  290  
##  1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427  
##  Median : 450000   Median : 3.000   Median :2.250   Median : 1910  
##  Mean   : 540088   Mean   : 3.371   Mean   :2.115   Mean   : 2080  
##  3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550  
##  Max.   :7700000   Max.   :33.000   Max.   :8.000   Max.   :13540  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    520   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5040   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7618   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  15107   Mean   :1.494   Mean   :0.007542   Mean   :0.2343  
##  3rd Qu.:  10688   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1651359   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##    condition         grade          sqft_above   sqft_basement   
##  Min.   :1.000   Min.   : 1.000   Min.   : 290   Min.   :   0.0  
##  1st Qu.:3.000   1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :3.000   Median : 7.000   Median :1560   Median :   0.0  
##  Mean   :3.409   Mean   : 7.657   Mean   :1788   Mean   : 291.5  
##  3rd Qu.:4.000   3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0  
##  Max.   :5.000   Max.   :13.000   Max.   :9410   Max.   :4820.0  
##     yr_built     yr_renovated       zipcode           lat       
##  Min.   :1900   Min.   :   0.0   Min.   :98001   Min.   :47.16  
##  1st Qu.:1951   1st Qu.:   0.0   1st Qu.:98033   1st Qu.:47.47  
##  Median :1975   Median :   0.0   Median :98065   Median :47.57  
##  Mean   :1971   Mean   :  84.4   Mean   :98078   Mean   :47.56  
##  3rd Qu.:1997   3rd Qu.:   0.0   3rd Qu.:98118   3rd Qu.:47.68  
##  Max.   :2015   Max.   :2015.0   Max.   :98199   Max.   :47.78  
##       long        sqft_living15    sqft_lot15    
##  Min.   :-122.5   Min.   : 399   Min.   :   651  
##  1st Qu.:-122.3   1st Qu.:1490   1st Qu.:  5100  
##  Median :-122.2   Median :1840   Median :  7620  
##  Mean   :-122.2   Mean   :1987   Mean   : 12768  
##  3rd Qu.:-122.1   3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :-121.3   Max.   :6210   Max.   :871200
# Remove all auxiliary information and data transformation (num -> factor)
factorfun <- function(x){
      x <- as.factor(x)
}
newdata <- data %>% mutate_at(c("waterfront", "view", "condition"), factorfun)

# Replace variables of yr_built and yr_renovated with age 
for (i in 1:nrow(newdata)){
  if(newdata$yr_renovated[i]!=0)
    newdata$yr_built[i] <- newdata$yr_renovated[i]
}
newdata$age <- 2020 - newdata$yr_built

newdata <- newdata %>% dplyr::select(price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, 
                              view, condition, age, lat, long)


str(newdata)
## tibble [21,613 × 12] (S3: tbl_df/tbl/data.frame)
##  $ price      : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms   : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms  : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living: num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot   : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors     : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view       : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition  : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 3 3 3 3 3 3 ...
##  $ age        : num [1:21613] 65 29 87 55 33 19 25 57 60 17 ...
##  $ lat        : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long       : num [1:21613] -122 -122 -122 -122 -122 ...
dim(newdata)
## [1] 21613    12
## Remove any missings.
newdata <- newdata[rowSums(is.na(newdata))==0,]
dim(newdata)
## [1] 21613    12
library(corrplot)


str(data)
## tibble [21,613 × 19] (S3: tbl_df/tbl/data.frame)
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ zipcode      : num [1:21613] 98178 98125 98028 98136 98074 ...
##  $ lat          : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num [1:21613] -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num [1:21613] 5650 7639 8062 5000 7503 ...
corrplot(cor(data))

corrplot(cor(data),method='number')

# Highly correlated variables
correlated <- cor(data)
##The caret findCorrelation evaluates pair-wise correlations across all variables, flagging variables that are highly correlated. Of the identified pairs, the function recommends the removal of the variable with the highest average absolute correlation across the dataset.
highCorr <- findCorrelation(correlated, cutoff=0.80)
highCorr
## [1] 4
names(data[highCorr])
## [1] "sqft_living"
summary(data)
##      price            bedrooms        bathrooms      sqft_living   
##  Min.   :  75000   Min.   : 0.000   Min.   :0.000   Min.   :  290  
##  1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427  
##  Median : 450000   Median : 3.000   Median :2.250   Median : 1910  
##  Mean   : 540088   Mean   : 3.371   Mean   :2.115   Mean   : 2080  
##  3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550  
##  Max.   :7700000   Max.   :33.000   Max.   :8.000   Max.   :13540  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    520   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5040   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7618   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  15107   Mean   :1.494   Mean   :0.007542   Mean   :0.2343  
##  3rd Qu.:  10688   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1651359   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##    condition         grade          sqft_above   sqft_basement   
##  Min.   :1.000   Min.   : 1.000   Min.   : 290   Min.   :   0.0  
##  1st Qu.:3.000   1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :3.000   Median : 7.000   Median :1560   Median :   0.0  
##  Mean   :3.409   Mean   : 7.657   Mean   :1788   Mean   : 291.5  
##  3rd Qu.:4.000   3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0  
##  Max.   :5.000   Max.   :13.000   Max.   :9410   Max.   :4820.0  
##     yr_built     yr_renovated       zipcode           lat       
##  Min.   :1900   Min.   :   0.0   Min.   :98001   Min.   :47.16  
##  1st Qu.:1951   1st Qu.:   0.0   1st Qu.:98033   1st Qu.:47.47  
##  Median :1975   Median :   0.0   Median :98065   Median :47.57  
##  Mean   :1971   Mean   :  84.4   Mean   :98078   Mean   :47.56  
##  3rd Qu.:1997   3rd Qu.:   0.0   3rd Qu.:98118   3rd Qu.:47.68  
##  Max.   :2015   Max.   :2015.0   Max.   :98199   Max.   :47.78  
##       long        sqft_living15    sqft_lot15    
##  Min.   :-122.5   Min.   : 399   Min.   :   651  
##  1st Qu.:-122.3   1st Qu.:1490   1st Qu.:  5100  
##  Median :-122.2   Median :1840   Median :  7620  
##  Mean   :-122.2   Mean   :1987   Mean   : 12768  
##  3rd Qu.:-122.1   3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :-121.3   Max.   :6210   Max.   :871200
#Let’s split the loaded dataset into train and test sets. We will use 80% of the data to train our models and 20% will be used to test the models..

set.seed(123)
ind <- sample(2, nrow(newdata), prob = c(0.8, 0.2), replace = T)
train <- newdata[ind == 1, ]
test <- newdata[ind == 2,]

dim(newdata)
## [1] 21613    12
dim(train)
## [1] 17346    12
dim(test)
## [1] 4267   12
data
## # A tibble: 21,613 × 19
##      price bedrooms bathrooms sqft_living sqft_lot floors waterfront  view
##      <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>      <dbl> <dbl>
##  1  221900        3      1           1180     5650      1          0     0
##  2  538000        3      2.25        2570     7242      2          0     0
##  3  180000        2      1            770    10000      1          0     0
##  4  604000        4      3           1960     5000      1          0     0
##  5  510000        3      2           1680     8080      1          0     0
##  6 1225000        4      4.5         5420   101930      1          0     0
##  7  257500        3      2.25        1715     6819      2          0     0
##  8  291850        3      1.5         1060     9711      1          0     0
##  9  229500        3      1           1780     7470      1          0     0
## 10  323000        3      2.5         1890     6560      2          0     0
## # ℹ 21,603 more rows
## # ℹ 11 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## #   sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>,
## #   lat <dbl>, long <dbl>, sqft_living15 <dbl>, sqft_lot15 <dbl>
str(train)
## tibble [17,346 × 12] (S3: tbl_df/tbl/data.frame)
##  $ price      : num [1:17346] 221900 538000 180000 1225000 257500 ...
##  $ bedrooms   : num [1:17346] 3 3 2 4 3 3 3 2 3 3 ...
##  $ bathrooms  : num [1:17346] 1 2.25 1 4.5 2.25 1 2.5 1 1 1.75 ...
##  $ sqft_living: num [1:17346] 1180 2570 770 5420 1715 ...
##  $ sqft_lot   : num [1:17346] 5650 7242 10000 101930 6819 ...
##  $ floors     : num [1:17346] 1 2 1 1 2 1 2 1 1.5 1 ...
##  $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view       : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition  : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 3 3 3 3 4 4 4 ...
##  $ age        : num [1:17346] 65 29 87 19 25 60 17 78 93 43 ...
##  $ lat        : num [1:17346] 47.5 47.7 47.7 47.7 47.3 ...
##  $ long       : num [1:17346] -122 -122 -122 -122 -122 ...
lm_model <- lm(price ~ .,
              data = train)
lm_model
## 
## Call:
## lm(formula = price ~ ., data = train)
## 
## Coefficients:
## (Intercept)     bedrooms    bathrooms  sqft_living     sqft_lot       floors  
##  -5.035e+07   -4.605e+04    3.574e+04    2.727e+02   -3.266e-02    3.629e+04  
## waterfront1        view1        view2        view3        view4   condition2  
##   4.792e+05    1.272e+05    8.924e+04    1.728e+05    3.622e+05   -8.403e+03  
##  condition3   condition4   condition5          age          lat         long  
##  -8.521e+03    2.781e+04    5.558e+04    1.156e+03    6.581e+05   -1.552e+05
summary(lm_model)
## 
## Call:
## lm(formula = price ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1360891  -107996   -10354    80713  4092628 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.035e+07  1.697e+06 -29.673   <2e-16 ***
## bedrooms    -4.605e+04  2.247e+03 -20.499   <2e-16 ***
## bathrooms    3.574e+04  3.889e+03   9.191   <2e-16 ***
## sqft_living  2.727e+02  3.095e+00  88.104   <2e-16 ***
## sqft_lot    -3.266e-02  4.134e-02  -0.790    0.430    
## floors       3.629e+04  3.819e+03   9.504   <2e-16 ***
## waterfront1  4.792e+05  2.460e+04  19.482   <2e-16 ***
## view1        1.272e+05  1.365e+04   9.321   <2e-16 ***
## view2        8.924e+04  8.158e+03  10.938   <2e-16 ***
## view3        1.728e+05  1.117e+04  15.472   <2e-16 ***
## view4        3.622e+05  1.801e+04  20.110   <2e-16 ***
## condition2  -8.403e+03  4.908e+04  -0.171    0.864    
## condition3  -8.521e+03  4.564e+04  -0.187    0.852    
## condition4   2.781e+04  4.564e+04   0.609    0.542    
## condition5   5.558e+04  4.590e+04   1.211    0.226    
## age          1.156e+03  8.479e+01  13.638   <2e-16 ***
## lat          6.581e+05  1.243e+04  52.938   <2e-16 ***
## long        -1.552e+05  1.338e+04 -11.600   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 218000 on 17328 degrees of freedom
## Multiple R-squared:  0.6481, Adjusted R-squared:  0.6478 
## F-statistic:  1877 on 17 and 17328 DF,  p-value: < 2.2e-16
#Predict
pLm <- predict(lm_model,test)
postResample(pLm,test$price)
##         RMSE     Rsquared          MAE 
## 2.134471e+05 6.612866e-01 1.378992e+05
plLinearSimple <-test %>% 
  ggplot(aes(price,pLm)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of price') +
  ylab('Predicted value of price')+
  theme_bw()

ggplotly(plLinearSimple)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#Cross validation
x <- data.matrix(train)
y <- train$price


control <- trainControl(method = "cv",
                        number = 10)

lineerCV <- train(price~.,
                data = train,
                method = "lm",
                trControl = control )
lineerCV
## Linear Regression 
## 
## 17346 samples
##    11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 15610, 15611, 15612, 15611, 15612, 15610, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   218345.4  0.6456408  137295.5
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lineerCV)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1360891  -107996   -10354    80713  4092628 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.035e+07  1.697e+06 -29.673   <2e-16 ***
## bedrooms    -4.605e+04  2.247e+03 -20.499   <2e-16 ***
## bathrooms    3.574e+04  3.889e+03   9.191   <2e-16 ***
## sqft_living  2.727e+02  3.095e+00  88.104   <2e-16 ***
## sqft_lot    -3.266e-02  4.134e-02  -0.790    0.430    
## floors       3.629e+04  3.819e+03   9.504   <2e-16 ***
## waterfront1  4.792e+05  2.460e+04  19.482   <2e-16 ***
## view1        1.272e+05  1.365e+04   9.321   <2e-16 ***
## view2        8.924e+04  8.158e+03  10.938   <2e-16 ***
## view3        1.728e+05  1.117e+04  15.472   <2e-16 ***
## view4        3.622e+05  1.801e+04  20.110   <2e-16 ***
## condition2  -8.403e+03  4.908e+04  -0.171    0.864    
## condition3  -8.521e+03  4.564e+04  -0.187    0.852    
## condition4   2.781e+04  4.564e+04   0.609    0.542    
## condition5   5.558e+04  4.590e+04   1.211    0.226    
## age          1.156e+03  8.479e+01  13.638   <2e-16 ***
## lat          6.581e+05  1.243e+04  52.938   <2e-16 ***
## long        -1.552e+05  1.338e+04 -11.600   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 218000 on 17328 degrees of freedom
## Multiple R-squared:  0.6481, Adjusted R-squared:  0.6478 
## F-statistic:  1877 on 17 and 17328 DF,  p-value: < 2.2e-16
#Predict
pLmCV <- predict(lineerCV,test)
postResample(pLmCV,test$price)
##         RMSE     Rsquared          MAE 
## 2.134471e+05 6.612866e-01 1.378992e+05
plLinearCV<-test %>% 
  ggplot(aes(price,pLmCV)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of price') +
  ylab('Predicted value of price')+
  theme_bw()

ggplotly(plLinearCV)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ridge <- train(price~.,
               data = train,
               method = "glmnet",
               tuneGrid = expand.grid(alpha = 0,
                                      lambda = seq(0.0001,1,length=50)),
               trControl = control )


pRidge <- predict(ridge,test)
postResample(pRidge,test$price)
##         RMSE     Rsquared          MAE 
## 2.138839e+05 6.602660e-01 1.350585e+05
plRidge <-test %>% 
  ggplot(aes(price,pRidge)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of price') +
  ylab('Predicted value of price')+
  theme_bw()

ggplotly(plRidge)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lasso <- train(price~.,
               data = train,
               method = "glmnet",
               tuneGrid = expand.grid(alpha = 1,
                                      lambda = seq(0.0001,1,length=50)),
               trControl = control )


pLasso <- predict(lasso,test)
postResample(pLasso,test$price)
##         RMSE     Rsquared          MAE 
## 2.134865e+05 6.610694e-01 1.377348e+05
plLasoo <-test %>% 
  ggplot(aes(price,pLasso)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of price') +
  ylab('Predicted value of price')+
  theme_bw()

ggplotly(plLasoo)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
library(catboost)
#Separate x and y of train and test dataset, which will very useful when we using this in the catboost package.
library(dplyr)
y_train <- unlist(train[c('price')])
X_train <- train %>% select(-price)
y_valid <- unlist(test[c('price')])
X_valid <- test %>% select(-price)

#Convert the train and test dataset to catboost specific format using the load_pool function by mentioning x and y of both train and test.
train_pool <- catboost.load_pool(data = X_train, label = y_train)
test_pool <- catboost.load_pool(data = X_valid, label = y_valid)


#Create an input params for the CatBoost regression.
params <- list(iterations=500,
               learning_rate=0.01,
               depth=10,
               loss_function='RMSE',
               eval_metric='RMSE',
               random_seed = 55,
               od_type='Iter',
               metric_period = 50,
               od_wait=20,
               use_best_model=TRUE)

modelCatboost <- catboost.train(learn_pool = train_pool,params = params)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0:   learn: 364735.1932310   total: 199ms    remaining: 1m 39s
## 50:  learn: 271078.1687112   total: 3.47s    remaining: 30.5s
## 100: learn: 215954.3839468   total: 6.57s    remaining: 26s
## 150: learn: 183005.6666038   total: 9.72s    remaining: 22.5s
## 200: learn: 162997.0175298   total: 13.1s    remaining: 19.4s
## 250: learn: 149479.6936442   total: 16.2s    remaining: 16.1s
## 300: learn: 140688.4749743   total: 19.4s    remaining: 12.8s
## 350: learn: 136098.3140342   total: 21.9s    remaining: 9.29s
## 400: learn: 132910.1449928   total: 23.9s    remaining: 5.9s
## 450: learn: 130175.8951410   total: 26s  remaining: 2.83s
## 499: learn: 127939.0697817   total: 28.2s    remaining: 0us
y_pred=catboost.predict(modelCatboost,test_pool)

catboostMetrics <- postResample(y_pred,test$price)
catboostMetrics
##         RMSE     Rsquared          MAE 
## 1.460674e+05 8.526002e-01 8.405154e+04
plCatboost <-test %>% 
  ggplot(aes(price,y_pred)) +
  geom_point(alpha=0.5) + 
  stat_smooth(aes(colour='black')) +
  xlab('Actual value of price') +
  ylab('Predicted value of price')+
  theme_bw()

ggplotly(plCatboost)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'